{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b70ae68c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import our programs\n",
    "import requests\n",
    "import re\n",
    "from bs4 import BeautifulSoup\n",
    "from collections import Counter\n",
    "import pandas as pd\n",
    "from urllib import request,response\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a7228a39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the data\n",
    "\n",
    "# Location of the original (res_clean) dataset\n",
    "res_clean_path = '/Users/alenasmith/Library/CloudStorage/GoogleDrive-asvinod@stanford.edu/Shared drives/Johnson Congress Project/Replication Files/Dictionary/Dictionary Data/res_clean.csv'\n",
    "\n",
    "# Read every column with object dtype (no numeric parsing); the file is latin-1 encoded\n",
    "df = pd.read_csv(res_clean_path, dtype=object, encoding='latin-1')\n",
    "\n",
    "# Clean the data\n",
    "\n",
    "# remove duplicates\n",
    "df = df.drop_duplicates()\n",
    "\n",
    "# remove any speech that has less than 350 characters.\n",
    "# .str.len() gives NaN for a missing text and NaN >= 350 is False, so rows\n",
    "# without text are dropped here instead of crashing len() with a TypeError.\n",
    "# .copy() makes the filtered frame independent, so the column assignments\n",
    "# below are not made on a slice of the unfiltered frame.\n",
    "df = df[df['text'].str.len() >= 350].copy()\n",
    "\n",
    "# Make the text a string & lowercase\n",
    "df['text'] = df['text'].astype(str).str.lower()\n",
    "\n",
    "# remove new lines\n",
    "df['text'] = df['text'].str.replace(r'\\n', ' ', regex=True)\n",
    "\n",
    "# remove two (or more) spaces in a row\n",
    "df['text'] = df['text'].str.replace(' +', ' ', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "63296faf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lists of words to search for in each dictionary\n",
    "god_terms = ['god', 'jesus', 'christ',\n",
    "             'savior', 'saviour', 'our lord', 'the lord',\n",
    "             'holy spirit']\n",
    "blessings_terms = ['bless', 'faith', 'pray', 'divine', 'salvation', 'worship',\n",
    "                   ' sin ', ' sins ', 'sinful', 'sinner', 'original sin',\n",
    "                   'heaven', 'holy ', 'spiritual']     \n",
    "scripture_terms = ['scripture', 'bible', 'biblical', 'psalm', 'ten commandments']\n",
    "religion_terms = ['religio', 'christian', 'gospel']\n",
    "political_terms = ['judeo',\n",
    "                   'religious freedom', 'religious libert', 'freedom of religion', \n",
    "                   'christian nation']\n",
    "all_terms = ['god', 'jesus', 'christ',\n",
    "             'savior', 'saviour', 'our lord', 'the lord',\n",
    "             'holy spirit', \n",
    "             'bless', 'faith', 'pray', 'divine', 'salvation', 'worship',\n",
    "             ' sin ', ' sins ', 'sinful', 'sinner', 'original sin',\n",
    "             'heaven', 'holy ', 'spiritual',\n",
    "             'scripture', 'bible', 'biblical', 'psalm', 'ten commandments',\n",
    "             'religio', 'christian', 'gospel',\n",
    "            'judeo']\n",
    "whistles_terms = ['pornography', \n",
    "                  'gender ideology', 'gender agenda',\n",
    "                  'pro-family', 'traditional famil', 'traditional marriage',\n",
    "                 'unborn', 'pro-life', 'protect life', 'protecting life', 'abortion agenda']\n",
    "\n",
    "\n",
    "# Start every per-dictionary counter column at zero\n",
    "for count_column in ('god_terms_count', 'blessings_terms_count', 'scripture_terms_count',\n",
    "                     'religion_terms_count', 'political_terms_count', 'all_terms_count',\n",
    "                     'whistles_terms_count'):\n",
    "    df[count_column] = 0\n",
    "\n",
    "\n",
    "# Function to build regex pattern for exceptions for each word\n",
    "def build_pattern(word):\n",
    "    # Handle specific exclusions\n",
    "    if word == 'god':\n",
    "        return r'\\b(?!god forbid\\b)(?!goddard\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'christ':\n",
    "        return r'\\b(?!\\w*christmas\\b)(?!christopher\\b)(?!christian\\b)(?!christians\\b)(?!christianity\\b)(?!christmases\\b)(?!christa\\b)(?!christi\\b)(?!christie\\b)(?!christine\\b)(?!christina\\b)(?!christy\\b)(?!christmastime\\b)(?!christon\\b)(?!gilchrist\\b)(?!christiansted\\b)(?!christensen\\b)(?!christen\\b)(?!christening\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'the lord':\n",
    "        return r'\\b(?!the lords and ladies\\b)(?!lord lieutenant\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'faith':\n",
    "        return r'\\b(?!faith and credit\\b)(?!faithfully execute\\b)(?!faithfully executed\\b)(?!faithfully to execute\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'pray':\n",
    "        return r'\\b(?!prayer and pledge\\b)(?!prayer and the pledge\\b)(?!spray\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'salvation':\n",
    "        return r'\\b(?!salvation army\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'holy':\n",
    "        return r'\\b(?!\\w*holyoke\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'gospel':\n",
    "        return r'\\b(?!gospel music\\b)(?!gospel lyres\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'heaven': \n",
    "        return r'\\b(?!heaven forbid\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'protect life':\n",
    "        return r'\\b(?!protect life and property\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    elif word == 'protecting life':\n",
    "        return r'\\b(?!protecting life and property\\b)\\w*{}\\w*\\b'.format(re.escape(word))\n",
    "    else:\n",
    "        return r'\\b{}\\w*\\b'.format(re.escape(word))\n",
    "\n",
    "\n",
    "# Iterate over each list of words and count their occurrences\n",
    "\n",
    "# Lowercase the text once; it does not change between terms, so there is no\n",
    "# need to redo it for every pattern.\n",
    "text_lower = df['text'].str.lower()\n",
    "\n",
    "term_groups = [(god_terms, 'god_terms_count'), (blessings_terms, 'blessings_terms_count'),\n",
    "               (scripture_terms, 'scripture_terms_count'), (religion_terms, 'religion_terms_count'),\n",
    "               (political_terms, 'political_terms_count'), (all_terms, 'all_terms_count'),\n",
    "               (whistles_terms, 'whistles_terms_count')]\n",
    "\n",
    "for terms, column in term_groups:\n",
    "    for word in terms:\n",
    "        # Regex for this term, including any term-specific exclusions\n",
    "        pattern = build_pattern(word)\n",
    "\n",
    "        # Add this term's per-speech match count to the dictionary's running total\n",
    "        df[column] += text_lower.str.count(pattern, flags=re.DOTALL)\n",
    "\n",
    "        \n",
    "# Making binaries off of our dictionaries as created above\n",
    "\n",
    "# Each *_binary column is 1 when the matching *_count column is positive, else 0\n",
    "count_to_binary = [('god_terms_count', 'god_terms_binary'),\n",
    "                   ('blessings_terms_count', 'blessings_terms_binary'),\n",
    "                   ('scripture_terms_count', 'scripture_terms_binary'),\n",
    "                   ('religion_terms_count', 'religion_terms_binary'),\n",
    "                   ('political_terms_count', 'political_terms_binary'),\n",
    "                   ('all_terms_count', 'all_terms_binary'),\n",
    "                   ('whistles_terms_count', 'whistles_terms_binary')]\n",
    "for count_column, binary_column in count_to_binary:\n",
    "    has_match = df[count_column] > 0\n",
    "    df[binary_column] = has_match.astype(int)\n",
    "\n",
    "\n",
    "# Export the final data (NOTE the further instructions below)\n",
    "\n",
    "df.to_csv('df_dict.csv', index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d419fde",
   "metadata": {},
   "source": [
    "Minor edits to do after exporting data:\n",
    "- Remove \"anti-abortion agenda\" (4 instances - rows 319, 338, 4553, 5749), which is counted because it contains the dictionary term \"abortion agenda\". In each of these rows, change the 1 in whistles_terms_count and whistles_terms_binary to 0; every dictionary column in these rows will then be 0.\n",
    "- Add the occurrences of \"sin\"/\"sins\" followed by punctuation, which the \" sin \" and \" sins \" dictionary terms miss because they require a trailing space: \" sin.\" (row 15936, increase by 2); \" sin,\" (rows 6023, 8728, 19325, 19535); \" sins.\" (rows 15936 (increase by 2) and 16754); \" sins,\" (rows 2513, 7141). For each of these, increase blessings_terms_count and all_terms_count by 1 (or by 2 where noted), and if blessings_terms_binary or all_terms_binary is 0, change it to 1.\n",
    "- Rows 4444, 10069, and 17344 also contain \" sin\", but only as part of \"original sin\", which is already a dictionary term. They are therefore not included in any of the lists above, even though a search for these terms will return them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d549058",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
